In [4]:
# Notebook setup: pull in shared helpers (presumably pandas as `pd`, `os`,
# `display`, `HTML`, and the DATA_PATH constant) — NOTE(review): star imports
# hide where names come from; confirm against setup.py.
from setup import *
import sys
# Make the data directory importable so `constants` below can be found there.
if DATA_PATH not in sys.path: sys.path.append(DATA_PATH)
from constants import *
%matplotlib inline
# Widen the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Keep DataFrame reprs compact vertically but show many columns.
pd.set_option('display.max_rows', 4)
pd.set_option('display.max_columns', 200)
In [ ]:
Let's look at some tweets
In [2]:
# Load the deduplicated tweet dump into a DataFrame indexed by tweet id.
# The file was written with QUOTE_NONNUMERIC quoting, so read it back the
# same way. Use the stdlib csv constant: `pd.io.common.csv` is a private
# pandas internal that no longer exists in current pandas releases.
import csv

print('Loading previously "cleaned" tweets (could take a minute or so)...')
df = pd.read_csv(os.path.join(DATA_PATH, 'deduped_tweets.csv.gz'), index_col='id', compression='gzip',
                 quotechar='"', quoting=csv.QUOTE_NONNUMERIC, low_memory=False)
print('Loaded {} tweets.'.format(len(df)))
df
Out[2]:
In [3]:
# Show the first ten tweet texts, each preceded by a blank separator line.
text = df.text.iloc[:10]
for tweet in text:
    print('\n' + tweet)
So even after subtracting "-Monty" in our search query, there are still a lot more meanings for Python than we intended.
This is one of the key challenges of natural language processing: "ambiguity".
There are a lot of names for dimension reduction techniques that attempt to determine meaning (semantics) from bag-of-words statistics (words used near each other).
SVD: Singular Value Decomposition
LDA: Linear Discriminant Analysis
In [ ]:
In [ ]: